This is a walkthrough on how to recreate the hematopoiesis visualizations from Figure 2 of our Cell Systems paper.
Load the required libraries and data. The hematopoiesis data (from Paul et al) can be downloaded here.
suppressWarnings(library(Seurat))
suppressWarnings(library(monocle))
library(swne)
library(ggplot2)
## Load monocle2 object
load("~/swne/Data/hemato_data/hemato_monocle_orig_cds.RData")
## Load counts and informative genes
counts <- ReadData("~/swne/Data/hemato_data/hemato_expr_debatched.tsv", make.sparse = T)
info.genes <- scan("~/swne/Data/hemato_data/hemato_info_genes.txt", sep = "\n", what = character())
## Load clusters
clusters.df <- read.table("~/swne/Data/hemato_data/hemato_cluster_mapping.csv", sep = ",")
clusters <- clusters.df[[2]]; names(clusters) <- clusters.df[[1]]
counts <- counts[,names(clusters)]
Filter genes and cells and make Seurat object
counts <- FilterData(counts, min.samples.frac = 0.005, trim = 0.005, min.nonzero.features = 200)
info.genes <- info.genes[info.genes %in% rownames(counts)]
dim(counts)
## [1] 8653 2730
se.obj <- CreateSeuratObject(counts)
Set cluster names, exclude lymphoid cells and dendritic cells (which are not part of the developmental trajectory), set cluster colors.
se.obj <- SetIdent(se.obj, cells.use = names(clusters), clusters)
se.obj@ident <- plyr::revalue(se.obj@ident,
c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery',
"6" = 'Ery', "7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
"11" = 'DC', "12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M',
"16" = 'Neu', "17" = 'Neu', "18" = 'Eos', "19" = 'lymphoid'))
se.obj <- SubsetData(se.obj, ident.remove = c("lymphoid", "DC"))
clusters <- se.obj@ident; names(clusters) <- se.obj@cell.names;
cluster_colors <- c("Bas" = "#ff6347", "Eos" = "#EFAD1E",
"Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859",
"GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")
Scale data, run PCA, t-SNE, and UMAP, and build the SNN.
## Scale data
se.obj@data <- ScaleCounts(se.obj@raw.data[,se.obj@cell.names], method = "log")
## calculating variance fit ... using gam
se.obj@scale.data <- se.obj@data - Matrix::rowMeans(se.obj@data)
## Run PCA
se.obj <- RunPCA(se.obj, pc.genes = info.genes, do.print = F, pcs.compute = 40)
PCElbowPlot(se.obj, num.pc = 40)
## Run t-SNE, UMAP, and SNN
se.obj <- RunTSNE(se.obj, dims.use = 1:15)
se.obj <- RunUMAP(se.obj, reduction.use = "pca", dims.use = 1:15)
se.obj <- BuildSNN(se.obj, dims.use = 1:15, k.param = 40, prune.SNN = 0.0, force.recalc = T)
Identify number of factors to use for SWNE
k.range <- seq(2,20,2) ## Range of factor values to test
n.cores <- 16 ## Number of cores to use
norm.counts <- se.obj@data[,names(clusters)]
k.err <- FindNumFactors(norm.counts[info.genes,], k.range = k.range, n.cores = n.cores,
seed = 32590, do.plot = F)
PlotFactorSelection(k.err, font.size = 15)
Run SWNE
k <- 12
nmf.res <- RunNMF(norm.counts[info.genes,], k = k, init = "ica", n.cores = n.cores, ica.fast = F) ## Run NMF
nmf.scores <- nmf.res$H
nmf.res$W <- ProjectFeatures(norm.counts, nmf.res$H, n.cores = n.cores) ## Project all genes onto NMF
## Run SWNE embedding
swne.embedding <- EmbedSWNE(nmf.scores, SNN = se.obj@snn, alpha.exp = 1.5, snn.exp = 0.1, n_pull = 3,
dist.use = "cosine")
## Initial stress : 0.09063
## stress after 10 iters: 0.03130, magic = 0.461
## stress after 20 iters: 0.02789, magic = 0.500
## stress after 30 iters: 0.02786, magic = 0.500
Identify biologically relevant factors and embed key marker genes. See Table S1 from the Cell Systems paper for how the factors were named and how key genes were selected.
swne.embedding <- RenameFactors(swne.embedding, name.mapping = c("factor_4" = "Erythrocyte differentiation",
"factor_5" = "Metal binding",
"factor_9" = "Epigenetic regulation",
"factor_10" = "HSC maintenance",
"factor_11" = "Inflammation"))
genes.embed <- c("Apoe", "Mt2", "Gpr56", "Sun2", "Flt3")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3)
Make SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.4, sample.groups = clusters, do.label = T,
label.size = 3.5, pt.size = 2, show.legend = F) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Make t-SNE plot
## tSNE plots
tsne.scores <- GetCellEmbeddings(se.obj, reduction.type = "tsne")
PlotDims(tsne.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Make UMAP plot
umap.scores <- GetCellEmbeddings(se.obj, reduction.type = "umap")
PlotDims(umap.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Extract pre-computed pseudotime (computed using Monocle2)
pseudotime <- cds$Pseudotime; names(pseudotime) <- colnames(cds@reducedDimS);
pseudotime <- pseudotime[names(clusters)]
SWNE pseudotime plot
FeaturePlotSWNE(swne.embedding, pseudotime, alpha.plot = 0.4, label.size = 3.5, pt.size = 1.5)
t-SNE pseudotime plot
FeaturePlotDims(tsne.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
UMAP pseudotime plot
FeaturePlotDims(umap.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
Now let’s compute a quantitative evaluation of SWNE, tSNE, and UMAP as well as other embeddings.
First we need to compute those other embeddings, including dmaps, MDS, Isomap, LLE, and PCA
library(destiny)
dm <- DiffusionMap(t(as.matrix(norm.counts[info.genes,])), k = 20, n_eigs = 2)
diff.scores <- dm@eigenvectors; rownames(diff.scores) <- colnames(norm.counts);
library(RDRToolbox)
isomap.scores <- Isomap(t(as.matrix(norm.counts[info.genes,])), dims = 2, k = 20)$dim2
rownames(isomap.scores) <- colnames(norm.counts)
lle.scores <- LLE(t(as.matrix(norm.counts[info.genes,])), dim = 2, k = 20)
rownames(lle.scores) <- colnames(norm.counts)
mds.scores <- cmdscale(dist(t(as.matrix(norm.counts[info.genes,]))), k = 2)
pc.scores <- GetCellEmbeddings(se.obj, dims.use = 1:2)
embeddings <- list(swne = t(as.matrix(swne.embedding$sample.coords)),
tsne = t(tsne.scores),
pca = t(pc.scores),
lle = t(lle.scores),
mds = t(mds.scores),
isomap = t(isomap.scores),
dmaps = t(diff.scores),
umap = t(umap.scores))
Next we define some helper functions that will help us evaluate these embeddings
library(FNN)
library(proxy)
##
## Attaching package: 'proxy'
## The following object is masked from 'package:destiny':
##
## as.matrix
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
knn.idx <- knn.index(t(emb), k = k)
knn.matrix <- matrix(0, ncol(emb), ncol(emb))
for (i in 1:nrow(knn.idx)) {
knn.matrix[knn.idx[i,],i] <- 1
knn.matrix[i, knn.idx[i,]] <- 1
}
rownames(knn.matrix) <- colnames(knn.matrix) <- colnames(emb)
as(knn.matrix, "dgCMatrix")
}
## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
a <- sum(x)
b <- sum(y)
c <- sum(x == 1 & y == 1)
c/(a + b - c)
}
## Function for identifying cells in the same path and time step
GetPathStep <- function(metadata, step.size, make.factor = T) {
path.step <- as.character(metadata$Group); names(path.step) <- rownames(metadata);
for (path in levels(factor(metadata$Group))) {
steps <- sort(unique(subset(metadata, Group == path)$Step))
step.range <- seq(min(steps), max(steps), step.size)
for(i in step.range) {
cells.step <- rownames(subset(metadata, Group == path & Step %in% seq(i, i + step.size - 1, 1)))
path.step[cells.step] <- paste(path, i, sep = ".")
}
}
if (make.factor) {
path.step <- factor(path.step)
}
path.step
}
## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}
Compute how well each embedding maintains local structure compared to the original gene expression space by comparing kNN networks.
n.neighbors <- 40
ref.knn <- ComputeKNN(norm.counts[info.genes,], k = n.neighbors)
## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, ComputeKNN, k = n.neighbors)
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})
Compute how well each embedding maintains global strucutre by computing the centroids of each trajectory-cluster grouping in the embedding space and original gene expression space and correlating the pairwise distances between the centroids.
metadata.df <- data.frame(Group = clusters, Step = order(pseudotime[names(clusters)]))
clusters.steps <- GetPathStep(metadata.df, step.size = 50, make.factor = T)
traj.dist <- CalcPairwiseDist(norm.counts[info.genes,], clusters.steps)
embeddings.cor <- sapply(embeddings, function(emb) {
emb.dist <- CalcPairwiseDist(emb, clusters.steps)
cor(traj.dist, emb.dist)
})
Plot how well each embedding maintaings global vs local structure.
library(ggplot2)
library(ggrepel)
scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
xlab("Neighborhood Score") + ylab("Path-Time Distance Correlation") +
geom_text_repel(aes(x, y, label = name), size = 5) +
xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))